## "China's Ideological Spectrum"
## by Jennifer Pan and Yiqing Xu

### Install Packages
## install.packages("psych")

#############################
## PCA analyses
#############################

rm(list=ls(all=TRUE))
load("data/sample10K.RData")
questions <- paste("q", 1:50, sep = "")

## run PCA
pca.out<-prcomp(d[,questions], center=TRUE,scale=TRUE)
names(pca.out)
summary(pca.out)

## get PC
d$PC1 <- pca.out$x[,1] * -1 
d$PC2 <- pca.out$x[,2]
d$PC3 <- pca.out$x[,3]
ord <- order(d$PC1) # increasing order

## Rotation matrix
RMatrix <- pca.out$rotation
RMatrix[,1] <- RMatrix[,1] * -1
for (i in 1:50) {RMatrix[,i] <- RMatrix[,i] * sd(pca.out$x[,i])}
save(RMatrix, questions, ord, pca.out, file="output/result_pca.RData")

#################################################
## Figure 4 (left). PCA and EFA Dimentionality
#################################################

rm(list=ls(all=TRUE))
load("data/sample10K.RData")
load("output/result_pca.RData")

library(psych)
out <- scree(d[,questions])
fv <- out$fv
pcv <- out$pcv

pdf("graphs/pca_scree.pdf")
par(mar = c(5, 4, 1, 1))
plot(1, type = "n", ylim = c(-0.5, 10), xlim = c(1,50),
     xlab = "", ylab = "", axes = FALSE)
mtext("i'th Principal Component or Factor", 1, 2.5, cex = 1.5)
mtext("Eigen Values of Components and Factors", 2, 2.5, cex = 1.5)
abline(h = 1, col = "#AAAAAA50", lwd = 4, lty = 1)
abline(h = seq(-1,20,0.5), col = "#AAAAAA50")
abline(v = seq(0,50,by=2), col = "#AAAAAA50")
lines(fv, col = "gray30", lwd = 2, lty = 3)
points(fv, pch = 16, col = "white", cex = 1.8)
lines(pcv, col = 1, lwd = 2)
points(pcv, pch = 16, col ="white", cex = 1.8) 
points(fv, col = "gray30", pch = 1, cex = 1.1)
points(pcv, pch = 16, col =1, cex = 1.2)
axis(1, at = seq(0,50,10), cex.axis = 1.3)
axis(2, at = 0:10, cex.axis = 1.3)
legend("topright", c("Principal Component", "Factor"), pch = c(16, 1),
       col = c(1, "gray30"), bty = "n", cex = 1.5)
box()
graphics.off()


#########################################################
## Figure 4 (right). Dimensionality: based on prediction
#########################################################

rm(list=ls(all=TRUE))
load("data/sample10K.RData")
load("output/result_pca.RData")

## PCP with more and more PCs
## standardize. we know: x = s %*% R; x %*% W = s
d <- d[,questions]
choice<-ifelse(d>=2.5,1,0)
s<-apply(d,2,function(vec){vec<-(vec-mean(vec))/sd(vec)}) # a (N*50) matrix
R<-pca.out$rotation  # rotation matrix
W<-solve(R) # inverse
x<-pca.out$x  # PCs
s.mean<-pca.out$center
s.sd<-pca.out$sdev

maxdim<-50
result<-matrix(NA,50,(maxdim+1))
for (i in 0:maxdim) {
    if (i==0) {
        pred<-matrix(0,dim(s)[1],dim(s)[2])
    } else if (i==1) {
        pred<-as.matrix(x[,1])%*%matrix(W[1,],nrow=1)  
    } else {
        pred<-x[,1:i]%*%W[1:i,]  
    }
    pred.out<-pred
    for (j in 1:50){
        pred.out[,j]<-pred.out[,j]*s.sd[j]+s.mean[j]
    }
    choice.pred<-ifelse(pred.out>=2.5,1,0)
    choice.right<-choice.pred==choice  # (N*50)
    result[,(i+1)]<-apply(choice.right,2,mean)
    if (i%%10==0) cat(i) else cat(".")
}
result1 <- result
## predict independently
maxdim<-50
result<-matrix(NA,50,(maxdim+1))
for (i in 0:maxdim) {
    if (i==0) {
        pred<-matrix(0,dim(s)[1],dim(s)[2])
    } else {
        pred<-as.matrix(x[,i])%*%matrix(W[i,],nrow=1)  
    }
    pred.out<-pred
    for (j in 1:50){
        pred.out[,j]<-pred.out[,j]*s.sd[j]+s.mean[j]
    }
    choice.pred<-ifelse(pred.out>=2.5,1,0)
    choice.right<-choice.pred==choice  # (N*50)
    result[,(i+1)]<-apply(choice.right,2,mean)
    if (i%%10==0) cat(i) else cat(".")
}
result2 <- result
save(result1, result2, file="output/result_pca_prediction.RData")



## plotting
load("output/result_pca_prediction.RData")
pdf("graphs/pca_prediction.pdf")
par(mar=c(4.5,5,1,1), mfcol = c(2,1))
## collectively
rate<-apply(result1,2,mean) * 100;
round(rate[1:10], 0)
## increasing predictive power
## 61% -> 69% -> 71% -> 73% -> 74% -> 75% -> 76%-> 77%
plot(1, type = "n", xlim = c(0,50), ylim=c(58,103),xlab="",ylab="",axes=FALSE)
abline(h = seq(60,100,by=5), col = "#AAAAAA50")
abline(v = c(1,3,seq(0,50,by=2)), col = "#AAAAAA50")
lines(0:50,rate,lwd=2)
points(0:50,rate, pch = 16, col = "white", cex = 1.8)
points(0:50,rate, pch = 16, cex = 1.2)
mtext("Number of Principal Components",1,2.5,cex=1.2)
axis(1, at = c(1:3, seq(0,50,10)), cex.axis = 1.2)
axis(2, cex.axis = 1.2)
text(35, 80, "Percentage Correctly Predicted\n(Accumulative PCP) %",cex=1.2)
box()
## independently
par(mar=c(5,5,0.5,1))
rate<-apply(result2,2,mean)
power<-(rate[-1]-rate[1])*100
plot(1, type = "n", xlim=c(0,51),ylim=c(-1,9),xlab="", ylab="", axes = FALSE)
abline(h = seq(0,8,by=1), col = "#AAAAAA50")
abline(v = c(1,3, seq(0,50,by=2)), col = "#AAAAAA50")
lines(1:50,power,lwd=2)
points(1:50, power,pch=16, col = "white", cex = 1.8)
points(1:50, power, pch = 16, cex = 1.2)
abline(h=0,col="gray",lwd=2)
mtext("i'th Principal Component",1,2.5,cex=1.2)
axis(1, at = c(1:3, seq(0,50,10)), cex.axis = 1.2)
axis(2, cex.axis = 1.2)
text(35, 3,"Independent Predictive Power\n(Increase in PCP %)",cex=1.2)
box()
graphics.off()
